######################################################
###### Merging Pew and Harris Data ###################
######################################################

## Purpose: This script pulls together the Harris 
## and Pew data into a single file. We also harmonize
## the variable levels across this file. 
## Data In: 
## 1) Harmonized Pew dataset:
## data_files/Pew_data/p2012valmer.por
## Follow instructions in README to download
## 2) Individual study Harris study files:
## all .rds files in the folder "Harris_Data" 
## Data Out:
## 1) data_files/data_merged_4_29_2025.rds
## 2) figures/income_imputation
## 3) figures/AJS/age_imputation
## Software Versions: 

## R version 4.1.2
library(haven) ## 2.5.4
library(tidyverse) ## 2.0.0
library(VGAM) ## 1.1.11
library(acs) ## 2.1.4
## Note: the ACS package has now been archived
## to download this package you can install
## from this link:
## https://cran.r-project.org/src/contrib/Archive/acs/acs_2.1.4.tar.gz

## set working directory to replication file


#### this file has three sections:
### 1) Takes the Pew data and transforms 
## it into a format they can be merged with the Harris
## data
### 2) Takes each individual Harris poll, which 
## were previously individually subsetted and harmonized
## (as each poll was different).
## 3) Harmonizes the different values of variables across
## the different surveys. 
## To recreate each of the Harris rds files,
## you would need to follow the instructions in the
## README to download the Harris data and use the provided
## R scripts. 

######################################################
######### Harmonizing Pew Data ##################
######################################################

##### Download this file using the link
## provided in the README - this is the harmonized
## PEW data file

pewdata <- read_spss("data_files/p2012valmer.por")

## Study variable
pewdata$study <- paste0("Pew", pewdata$YEAR)

## year
pewdata$year <- pewdata$YEAR

## urban
pewdata$urban <- as.character(haven::as_factor(pewdata$CITYSIZE))
pewdata$urban[pewdata$year == "2007"] <- as.character(
  haven::as_factor(pewdata$USR[pewdata$year == "2007"])
)


## region
pewdata$region <- as.character(haven::as_factor(pewdata$CREGION))


## household
pewdata$hh <- NA

## inequality variable
pewdata$inequality <- as.character(haven::as_factor(pewdata$Q16Q))
pewdata$inequality.variable <- 5

## union membership - collapsed - will combine
## information below
pewdata$union.self <- as.character(haven::as_factor(pewdata$LABORHH))
pewdata$union.other <- as.character(haven::as_factor(pewdata$LABORHH))

## employment
pewdata$employed <- NA

pewdata$employed.self <- as.character(haven::as_factor(pewdata$EMPLOY))
pewdata$occupation <- NA  
pewdata$occupation.self <- NA
pewdata$hhsize <- NA

## education
pewdata$educ <- NA
pewdata$educ[pewdata$year %in%
                 c("1987", "1988", 
                   "1989", "1990",
                   "1991", "1992",
                   "1993")] <- as.character(haven::as_factor(pewdata$EDUC87[pewdata$year %in%
                                                                                c("1987", "1988", 
                                                                                  "1989", "1990",
                                                                                  "1991", "1992",
                                                                                  "1993")]))
pewdata$educ[pewdata$year %in%
               c("1994", "1997", 
                 "1999", "2002",
                 "2003", "2007",
                 "2009")] <- as.character(haven::as_factor(pewdata$EDUC94[pewdata$year %in%
                                                                            c("1994", "1997", 
                                                                              "1999", "2002",
                                                                              "2003", "2007",
                                                                              "2009")]))

pewdata$educ[pewdata$year %in%
               c("2012")] <- as.character(haven::as_factor(pewdata$EDUC12[pewdata$year %in%
                                                                            c("2012")]))

## income
pewdata$income <- NA
pewdata$income[pewdata$year %in%
                 c("1987", "1988",
                   "1990")] <- as.character(haven::as_factor(pewdata$INCOME87[pewdata$year %in%
                                                                                c("1987", "1988",
                                                                                  "1990")]))
pewdata$income[pewdata$year %in%
                 c("1989")] <- as.character(haven::as_factor(pewdata$INCOME89[pewdata$year %in%
                                                                                c("1989")]))
pewdata$income[pewdata$year %in%
                 c("1991", "1992",
                   "1993")] <- as.character(haven::as_factor(pewdata$INCOME91[pewdata$year %in%
                                                                                c("1991", "1992",
                                                                                  "1993")]))
pewdata$income[pewdata$year %in%
                 c("1994", "1997",
                   "1999", "2002",
                   "2003")] <- as.character(haven::as_factor(pewdata$INCOME94[pewdata$year %in%
                                                                                c("1994", "1997",
                                                                                  "1999", "2002",
                                                                                  "2003")]))
pewdata$income[pewdata$year %in%
                 c("2007", "2009",
                   "2012")] <- as.character(haven::as_factor(pewdata$INCOME07[pewdata$year %in%
                                                                                c("2007", "2009",
                                                                                  "2012")]))

## age
pewdata$age <- as.character(haven::as_factor(pewdata$AGE))

## race
pewdata$race <- as.character(haven::as_factor(pewdata$RACE))
pewdata$HISP <- as.character(haven::as_factor(pewdata$HISP))

pewdata$race <- ifelse(pewdata$race == "DK/Ref",
                       "Don't know/refused",
                       ifelse(pewdata$race == "White",
                              ifelse(pewdata$HISP == "DK/Ref",
                                     "Don't know/refused",
                                     ifelse(pewdata$HISP == "No",
                                            "Non-Hispanic White",
                                            "Hispanic White")),
                              "Non-White"))


## party
pewdata$party <- as.character(haven::as_factor(pewdata$PARTY))
pewdata$party[pewdata$party == "Independent" &
                pewdata$year == "1987"] <- "Independent/Other/Don't Know/Refused"
## in 1987 these were grouped together
## for those who responded independent, no preference
## other, don't know/refused, were asked if they lean more Republican or Democrat:
pewdata$PARTYLN <- as.character(haven::as_factor(pewdata$PARTYLN))

## ideology
pewdata$ideology <- NA
pewdata$ideology[pewdata$year %in% c("1987", "1993")] <-
  as.character(as_factor(pewdata$IDEO87[pewdata$year %in% c("1987", "1993")]))
pewdata$ideology[pewdata$year %in% c("2002", "2003", "2007",
                                     "2009", "2012")] <-
  as.character(as_factor(pewdata$IDEO94[pewdata$year %in% c("2002", "2003", "2007",
                                                            "2009", "2012")]))


## gender
pewdata$gender <- as.character(as_factor(pewdata$SEX))

## religion
pewdata$religion <- as.character(as_factor(pewdata$RELIG))


## factual questions
pewdata$factual1 <- as.character(as_factor(pewdata$FOLGOV))
pewdata$factual2 <- as.character(as_factor(pewdata$OFTVOTE))
pewdata$factual3 <- NA


## alienation questions
pewdata$dontcare <- NA
pewdata$dontcount <- NA
pewdata$leftout <- NA
pewdata$pew.govtcare <- as.character(as_factor(pewdata$Q2C))
pewdata$pew.nosay <- as.character(as_factor(pewdata$Q2A))
pewdata$pew.finsat <- as.character(as_factor(pewdata$Q16V))

## additional variables
pewdata$state <- as.character(as_factor(pewdata$STATE))


## statements about racial inequality
pewdata$racial_gains <- as.character(as_factor(pewdata$Q10J))
pewdata$racial_discrimination <- as.character(as_factor(pewdata$Q10M))


pewdata <- pewdata[,c("study", "year", "urban", "region", "hh",
                          "inequality", "inequality.variable", "union.self", "union.other",
                          "employed", "employed.self", "occupation", "occupation.self", "hhsize", "educ", "income", 
                          "age", "race", "party", "ideology", "gender", "religion",
                          "factual1", "factual2", "factual3", "dontcare", "dontcount",
                          "leftout", "pew.nosay", "pew.govtcare", "pew.finsat", 
                          "WEIGHT", "state", "PARTYLN", "RESPID", "racial_gains", "racial_discrimination")]

#### adding in question place data to pew
## "after party", "before party", "no party"
## as with Harris data, we coded question placement
## based on questions about voting intentions and party identification
## we didn't consider questions asking respondents to evaluate 
## candidates/politicians
pewdata$question_place <- NA
pewdata$question_place[pewdata$year == "1987"] <- "after party"
pewdata$question_place[pewdata$year == "1988"] <- "after party"
pewdata$question_place[pewdata$year == "1989"] <- "questionnaire missing"
pewdata$question_place[pewdata$year == "1990"] <- "after party"
pewdata$question_place[pewdata$year == "1991"] <- "after party"
pewdata$question_place[pewdata$year == "1992"] <- "after party" 
pewdata$question_place[pewdata$year == "1993"] <- "no inequality question"
pewdata$question_place[pewdata$year == "1994"] <- "after party"
pewdata$question_place[pewdata$year == "1997"] <- "questionnaire missing"
pewdata$question_place[pewdata$year == "1999"] <- "after party"
pewdata$question_place[pewdata$year == "2002"] <- "after party"
pewdata$question_place[pewdata$year == "2003"] <- "after party"
pewdata$question_place[pewdata$year == "2007"] <- "questionnaire missing"
pewdata$question_place[pewdata$year == "2009"] <- "after party"
pewdata$question_place[pewdata$year == "2012"] <- "after party"

pewdata$inequality.mid.rich <- NA
pewdata$inequality.mid.poor <- NA


###############################################
######## Harris Data ##########################
###############################################

### we include all Harris Polls which included the question 
### "the rich get richer"
### See README for details on how we constructed these
### files and how to download data to recreate them.
 
harris_files <- list.files("Harris_Data")
harris_files <- harris_files[grepl(".rds", harris_files)]
harris_files <- paste0("Harris_Data/", harris_files)

harris <- lapply(harris_files, function(x){
  frame <- readRDS(x)
  return(frame)
})

dataframe <- do.call("rbind", harris)

## adding variables which are not present
dataframe$pew.nosay <- NA
dataframe$pew.govtcare <- NA
dataframe$pew.finsat <- NA
dataframe$WEIGHT <- NA
dataframe$state <- NA
dataframe$PARTYLN <- NA
dataframe$RESPID <- NA
dataframe$inequality.mid.rich <- NA
dataframe$inequality.mid.poor <- NA
dataframe$racial_gains <- NA
dataframe$racial_discrimination <- NA


dataframe <- dataframe[,c("study", "year", "urban", "region", "hh",
                          "inequality", "inequality.variable", "union.self", "union.other",
                          "employed", "employed.self", "occupation", "occupation.self", "hhsize", "educ", "income", 
                          "age", "race", "party", "ideology", "gender", "religion",
                          "factual1", "factual2", "factual3", "dontcare", "dontcount",
                          "leftout", "pew.nosay", "pew.govtcare", "pew.finsat",
                           "WEIGHT", "state", "PARTYLN", "RESPID", "racial_gains",
                           "racial_discrimination", "question_place", "inequality.mid.rich",
                           "inequality.mid.poor")]


dataframe <- bind_rows(dataframe, pewdata)


##########################################################
######## Harmonizing Pew and Harris Variables ############
##########################################################

## In this section we harmonize the 
## values of the variables
## Downstream we do additional harmonizing 
## by setting values to NA and creating 
## composite and transformed variables

## study
dataframe$study <- factor(dataframe$study)
pew.studies <- unique(dataframe$study)
pew.studies <- pew.studies[grepl("Pew", pew.studies)]

## urban
dataframe$urban <- dplyr::recode(dataframe$urban,
                                 `Central city` = "Urban",
                                 `Central City` = "Urban",
                                 `Rest of metro area` = "Suburban",
                                 `Small town` = "Rural",
                                 `Suburb` = "Suburban",
                                 `Town` = "Rural",
                                 `A large city` = "Urban",
                                 `A small city or town` = "Urban",
                                 `Sm.city/Town` = "Urban",
                                 `Large city` = "Urban",
                                 `A suburb near large city` = "Suburban",
                                 `Rural area` = "Rural",
                                 `SUBURBAN` = "Suburban",
                                 `URBAN` = "Urban",
                                 `RURAL` = "Rural",
                                 `Surburb` = "Suburban")

## region
dataframe$region <- dplyr::recode(dataframe$region,
                                  `East_(1)` = "East",
                                  `East_(2)` = "East",
                                  `Midwest_(5)` = "Midwest",
                                  `Midwest_(6)` = "Midwest",
                                  `South_(3)` = "South",
                                  `South_(4)` = "South",
                                  `West_(7)` = "West",
                                  `West_(8)` =  "West",
                                  `Northeast` = "East",
                                  `Mid-Atlantic` = "East",
                                  `New England (All)` = "East",
                                  `South-Central` = "South",
                                  `South-East` = "South",
                                  `Mountain` = "West",
                                  `Plains` = "Midwest")


## whether respondent head of house
## for some surveys we have details on position in household 
## for some surveys we don't - just yes or not whether they are the head of household
dataframe$hh <- dplyr::recode(dataframe$hh,
                              `Daughter` = "No",
                              `Female head (no male head)` = "Yes",
                              `Male head` = "Yes",
                              `Wf r fml hd f hshld (wf, mthr, sngl fml)` = "Yes", ## problematic category noted in the harmonizing guide 
                              `Both` = "Yes",
                              `Female head` = "Yes",
                              `Female head (no male head` = "Yes",
                              `FEMALE HEAD OF HOUSEHOLD` = "Yes",
                              `Female head of household (no male head)` = "Yes",
                              `Male head of household` = "Yes",
                              `MALE HEAD OF HOUSEHOLD` = "Yes",
                              `Ml hd f hshld (hsbnd, fthr, sngl ml)` = "Yes",
                              #`No answer/Refused` = "NA",
                              #`Not sure` = "NA",
                              `Other` = "No",
                              `Other (specify)` = "No",
                              `Other female` = "No",
                              `Other female adult over 18` = "No",
                              `Other is head` = "No",
                              `Other male` = "No",
                              `Other male adult over 18` = "No",
                              `Respondent is head` = "Yes",
                              `Son` = "No",
                              `Teenage boy 13 to 18 years old` = "No",
                              `Teenage girl 13 to 18 years old` = "No",
                              `Wf r fml hd f hshld (wf, mthr, sngl fml` = "Yes", ## this is the problematic category noted in the codebook
                              `WIFE OF HEAD OF HOUSEHOLD` = "No",
                              `Wife` = "No",
                              `Wife of head of household` = "No",
                              `No answer/refused` = "No answer/Refused")

## Inequality Perception Variable
## b/c it's central to our analysis, here we create two versions:
## 1) inequality_harm - harmonized version across all surveys
## 2) inequality - original values 

dataframe$inequality_harm <- dplyr::recode(dataframe$inequality,
                                       `Don t Feel` = "Don't Feel",
                                       `Not sure` = "Not Sure",
                                       `Don t feel` = "Don't Feel",
                                       `Don't feel` = "Don't Feel",
                                       `Agree` = "Feel", 
                                       `Agree somewhat` = "Feel",
                                       `Agree strongly` = "Feel",
                                       `Disagree` = "Don't Feel",
                                       `Disagree somewhat` = "Don't Feel",
                                       `Disagree strongly` = "Don't Feel",
                                       `Do feel` = "Feel",
                                       `Do not feel` = "Don't Feel",
                                       `don t feel` = "Don't Feel",
                                       `not sure` = "Not Sure",
                                       # `Refused` = "NA",
                                       `Completely agree` = "Feel",
                                       `Completely disagree` = "Don't Feel",
                                       #`Don't know/Refused` = "NA",
                                       `Mostly agree` = "Feel",
                                       `Mostly disagree` = "Don't Feel",
                                       `Don't know/Refused (VOL.)` = "Don't know/Refused",
                                       `No answer/refused` = "Refused",
                                       `No Answer/Refused` = "Refused",
                                       `Refuse` = "Refused",
                                       `Decline to answer` = "Refused",
                                       `No, don’t feel this way` = "Don't Feel",
                                       `Yes, feel this way` = "Feel",
                                       `No, don't feel this way` = "Don't Feel",
                                       `Not Feel` = "Don't Feel")


## inequality variable 
## this indicates which version of the main variable 
## surveys had used 
## 1 - Harris survey, Feel/Don't Feel/Not Sure
## 2 - Harris survey, agree strongly/agree somewhat/disagree somewhat/
## disagree strongly/not sure (wasn't actualy available in any surveys)
## 3 - Harris, yes feel this way/no don't feel this way
## 4 - Harris, agree/don't agree/not sure
## 5 - Pew, slightly different wording of question and answers 
## discussed in SI

dataframe$inequality.variable <- dplyr::recode(dataframe$inequality.variable,
                                               `1` = "Harris_feel__don't_feel__not_sure",
                                               `3` = "Harris_yes_feel__no_don't_feel",
                                               `4` = "Harris_agree__don't_agree__not_sure",
                                               `5` = "Pew_version")

##  union 
dataframe$union.other <- dplyr::recode(dataframe$union.other,
                                       `Not sure` = "Not Sure",
                                       `No answer/refused` = "Don't know/Refused")
dataframe$union.self <- dplyr::recode(dataframe$union.self,
                                      `Not sure` = "Not Sure",
                                      `Don't know/Refused (VOL.)` = "Don't know/Refused",
                                      `No answer/refused` = "Don't know/Refused")

## party 
## here we don't collapse the different non-response categories
## becaues they varied across surveys 
dataframe$party <- dplyr::recode(dataframe$party,
                                 `Can t recall` = "Not Sure",
                                 `Decline to answer` = "Refused",
                                 `Not sure` = "Not Sure",
                                 `not sure` = "Not Sure",
                                 `Other (specify)` = "Other",
                                 `other (specify)` = "Other",
                                 `Other (vol)` = "Other",
                                 `Wallace party (American Indpendent)` = "Other",
                                 `Other (vol )` = "Other",
                                 `Other party` = "Other",
                                 `Other (vol.)` = "Other",
                                 `Other party (VOL.)` = "Other",
                                 `Don't know/Refused (VOL.)` = "Don't know/Refused",
                                 `No preference (VOL.)` = "No preference",
                                 `Nor sure` = "Not Sure")


## PARTYLN - this variable is only in the Pew data
## if respondents to party answered "Independent", "No Preference",
## "other", "don't know/refused" to the party question
## they were asked whether they leaned republican or democrat


### harmonizing ideology
### one note is that a few surveys 
## included the option "radical" - have coded these as "Liberal" 
## collapsed to liberal, moderate, conservative 

dataframe$ideology <- dplyr::recode(dataframe$ideology,
                                    `Decline to answer` = "Refused",
                                    `Left wing` = "Liberal",
                                    `Middle of road` = "Moderate",
                                    `Middle of the road` = "Moderate",
                                    `Middle-of-the-road` = "Moderate",
                                    `Middle-Of-the-Road` = "moderate",
                                    `Not Sure` = "Not Sure",
                                    #`Radical` = "Liberal",
                                    `Right wing` = "Conservative",
                                    `moderate` = "Moderate",
                                    `Not sure` = "Not Sure",
                                    `Liberal, OR` = "Liberal",
                                    `very Conservative` = "Conservative",
                                    `Very Conservative` = "Conservative",
                                    `Very Liberal` = "Liberal",
                                    `Very liberal?` = "Liberal",
                                    `(VOL. DO NOT READ) Don't know/Refused` = "Don't know/Refused",
                                    `moderate` = "Moderate",
                                    `Very conservative` = "Conservative",
                                    `Very liberal` = "Liberal",
                                    `Radical` = "Liberal")
dataframe$ideology <- dplyr::recode(dataframe$ideology,
                                    `moderate` = "Moderate",
                                    `[VOL. DO NOT READ] Don't know/Refused` = "Don't know/Refused",
                                    `Liberal [OR]` = "Liberal",
                                    `Very Cons` = "Conservative")


## education 
dataframe$educ <- dplyr::recode(dataframe$educ,
                                `1-7 years completed` = "Less than high school",
                                `1st through 7th grade` = "Less than high school",
                                `2 yr college graduate` = "Some college",
                                `2-year college grad (community, etc )` = "Some college",
                                `2-year college graduate` = "Some college",
                                `2-yr cllg grdt (cmmnty cllg, tc )` = "Some college",
                                `4-year college graduate` = "College graduate",
                                `4th grade or less` = "Less than high school",
                                `5th or 6th grade` = "Less than high school",
                                `6th grade or less` = "Less than high school",
                                `7th or 8th grade` = "Less than high school",
                                `8 years completed` = "Less than high school",
                                `8th grade` = "Less than high school",
                                `8th grade (8 years of school completed)` = "Less than high school",
                                `8th grade completed` = "Less than high school",
                                `8th grade or less` = "Less than high school",
                                `8th grd (8 yrs f schl cmpltd)` = "Less than high school",
                                `9th-11th grade`= "Less than high school",
                                `Associates degree` = "Some college",
                                `Bachelors degree` = "College graduate",
                                `C No formal schooling (0 years)` = "Less than high school",
                                `college graduate` = "College graduate",
                                `College graduate` = "College graduate",
                                `Completed post graduate` = "Post graduate",
                                `Completed some high school` = "Less than high school",
                                `D Frst thrgh svnth grd (1-7 yrs f schl c` = "Less than high school",
                                #`Don't know` = "NA",
                                `E 8th grd (8 yrs f schl cmpltd)` = "Less than high school",
                                `F Sm hgh schl (9-11 yrs f schl cmpltd)` = "Less than high school",
                                `Finished high school` = "High school graduate",
                                `First through 7th grade` = "Less than high school",
                                `First through seventh grade (1-7 years completed)` = "Less than high school",
                                `First through seventh grade (1-7 years of school completed)` = "Less than high school",
                                `Four year college graduate` = "College graduate",
                                `Four year college graduate (completed 4 years of college)` = "College graduate",
                                `Four year College graduate (completed 4 years of college)` = "College graduate",
                                `Fr yr cllg grdt (cmpltd 4 yrs f cllg)` = "College graduate",
                                `Frst thrgh 7th grd (1-7 yrs cmpltd)` = "Less than high school",
                                `Frst thrgh 7th grd (1-7 yrs f schl cmplt` = "Less than high school",
                                `Frst thrgh svnth grd (1-7 yrs f schl c` = "Less than high school",
                                `Frst thrgh svnth grd (1-7 yrs f schl cmp` = "Less than high school",
                                `G Hgh schl grdt (12 yrs f schl cmpltd)` = "High school graduate",
                                `H Sm cllg (1-3 yrs f cllg cmpltd)` = "Some college",
                                `Hgh schl grdt (12 yrs f schl cmpltd)`  = "High school graduate",
                                `High school diploma` = "High school graduate",
                                `high school graduate` = "High school graduate",
                                `High school graduate` = "High school graduate",
                                `High school graduate (12 years of scho` = "High school graduate",
                                `I Tw yr cllg grdt (cmpltd 2 yrs cmmnty c` = "Some college",
                                `J Fr yr cllg grdt (cmpltd 4 yrs f cllg)` = "College graduate",
                                `K Post graduate (4 year college graduate`  = "Post graduate",
                                `Less than high school` = "Less than high school",
                                `No formal education` = "Less than high school",
                                `No formal schooling` = "Less than high school",
                                `No formal schooling (0 years)` = "Less than high school",
                                `No formal schooling (O years)` = "Less than high school",
                                #`Not sure` = "NA",
                                `post graduate`  = "Post graduate",
                                `Post graduate`  = "Post graduate",
                                `Post graduate (4 year college graduate`  = "Post graduate",
                                `Post graduate (4 year college graduate a`  = "Post graduate",
                                `Post graduate (4 year college graduate and completed at least 1 year of graduate school)`  = "Post graduate",
                                `Post graduate (4 years college graduate`  = "Post graduate",
                                `Postgraduate`  = "Post graduate",
                                #`Refused` = "NA",
                                `Sm cllg (1-3 yrs f cllg cmpltd)` = "Some college",
                                `Sm hgh schl (9-11 yrs f schl cmpltd` = "Less than high school",
                                `Sm hgh schl (9-11 yrs f schl cmpltd)` = "Less than high school",
                                `some college` = "Some college",
                                `Some college` = "Some college",
                                `Some college (1-3 years of college completed)` = "Some college",
                                `Some college (1-3 years of school completed)` = "Some college",
                                `Some college (1-3 years)` = "Some college",
                                `Some graduate school`  = "Post graduate",
                                `some high school` = "Less than high school",
                                `Some high school` = "Less than high school",
                                `Some high school (9-11 years of school` = "Less than high school",
                                `some high school (9-11 years of school completed)` = "Less than high school",
                                `Some high school (9-11 years of school completed)` = "Less than high school",
                                `Some high school (9th-11th grade)` = "Less than high school",
                                `Trade/technical/vocational after high school` = "Some college",
                                `Trade/technical/vocational school after high school` = "Some college",
                                `Tw yr cllg grdt (cmpltd 2 yrs cmmnty c` = "Some college",
                                `Tw yr cllg grdt (cmpltd 2 yrs cmmnty cll` = "Some college",
                                `Two year college graduate` = "Some college",
                                `Two year college graduate (completed 2 year community college, etc)` = "Some college",
                                `Two year college graduate (completed 2 years community college, etc)` = "Some college",
                                `2-yr cllg grdt (cmmnty, tc )` = "Some college",
                                `7th-8th grade` = "Less than high school",
                                `High school graduate (12 years of school completed)` = "High school graduate",
                                `Business/Technical/Vocational school AFTER high school` = "Some college",
                                `High school incomplete` = "Less than high school",
                                `None, or grade 1-8` = "Less than high school",
                                `Some college, no 4-year degree` = "Some college",
                                `College graduate or higher` = "College graduate",
                                `Bachelors graduate` = "College graduate",
                                `Not Sure` = "Not sure",
                                `Don't know` = "Not sure",
                                `First through seventh grade` = "Less than high school",
                                `Post graduate, at least one year` = "Post graduate",
                                `NA` = "No answer/refused",
                                `Associate Degree` = "Some college",
                                `Associate's degree` = "Some college",
                                `Bachelor's degree (e.g., B.A., A.B., B.S.)` = "College graduate",
                                `College (such as B.A., B.S.)` = "College graduate",
                                `Completed College` = "College graduate",
                                `Completed high school` = "High school graduate",
                                `Completed some college` = "Some college",
                                `Completed some graduate school` = "Post graduate",
                                `Completed some graduate school (but no post-graduate degree)` = "Post graduate",
                                `Completed some high school (grades 9 - 11, 12 but no diploma` = "Less than high school",
                                `Decline to answer` = "No answer/refused",
                                `Don't know/Refused (VOL.)` = "No answer/refused",
                                `Four year college or university degree/Bachelor’s degree (e.g., BS, BA, AB)` = "College graduate",
                                `Graduate degree (such as MBA, MS, M.D., Ph.D.)` = "Post graduate",
                                `High school graduate (Grade 12 with diploma or GED certificate)` = "High school graduate",
                                `High school incomplete (Grades 9-11 or Grade 12 with NO diploma)` = "Less than high school",
                                `J.D.` = "Post graduate",
                                `Job-specific training program(s) after high school` = "Some college",
                                `Less than high school (Grades 1-8 or no formal schooling)` = "Less than high school",
                                `Less than high school (grades 1-8)` = "Less than high school",
                                `M.D.` = "Post graduate",
                                `MA, MS, MFA` = "Post graduate",
                                `MBA` = "Post graduate",
                                `Other graduate or professional degree` = "Post graduate",
                                `Ph.D., Psy.D. or other academic doctorate` = "Post graduate",
                                `Post-graduate degree` = "Post graduate",
                                `Postgraduate or professional degree, including master’s, doctorate, medical or law degree (e.g., MA, MS, PhD, MD, JD)` = "Post graduate",
                                `Some college, but no degree` = "Some college",
                                `Some college, no degree (includes community college)` = "Some college",
                                `Some graduate school, but no degree` = "Post graduate",
                                `Some postgraduate or professional schooling, no postgraduate degree` = "Post graduate",
                                `Two year associate degree from any college or university` = "Some college",
                                `Two year college graduate (completed 2 years community college, etc.)` = "Some college",
                                `8th or less` = "Less than high school",
                                `First through 7th grade (1-7 years of school completed)` = "Less than high school",
                                `H.S. Grad` = "High school graduate",
                                `Some College` = "Some college",
                                `Some H.S.` = "Less than high school",
                                `Tech/Trade` = "Some college",
                                `Some post-graduate training for an advanced degree` = "Post graduate",
                                `Two year degree` = "Some college",
                                `Some post-Grad` = "Post graduate",
                                `Post-Grad` = "Post graduate",
                                `Post-Grad degree` = "Post graduate",
                                `College Grad` = "College graduate",
                                `Grade 8` = "Less than high school",
                                `Grade 5-7` = "Less than high school",
                                `None` = "Less than high school",
                                `Grades 5-7` = "Less than high")



## religion harmonzing 
## religion - issue is "other" category - when did certain religions move out of "other"
## into being a chooseable category
## categories: protestant, catholic, jewish, other, none 
## jewish available as option in earliest surveys

dataframe$religion <- dplyr::recode(dataframe$religion,
                                    `Roman Catholic` = "Catholic",
                                    `Other (specify)` = "Other",
                                    `Other (write in)` = "Other",
                                    `none` = "None",
                                    `refused` = "Refused",
                                    `Eastern Orthodox` = "Other",
                                    `Not sre` = "Not sure",
                                    `Other (SPECIFY)` = "Other",
                                    #`Not sure/no answer/refused` = "Refused",
                                    `None (vol.)` = "None",
                                    `Agnostic` = "Other",
                                    `Atheist` = "None",
                                    `Buddhist` = "Other",
                                    `Christian` = "Protestant",
                                    `Church of Jesus Christ of Latter-Day Saints (LDS)/Mormon` = "Other",
                                    `Decline to answer` = "Refused",
                                    `Eastern/Greek Orthodox` = "Other",
                                    `Hindu` = "Other",
                                    `Jehovah's Witness` = "Other",
                                    `Moslem/Islam` = "Other",
                                    `Native American` = "Other",
                                    `Wiccan` = "Other",
                                    `Mormon` = "Other",
                                    `Orthodox Church (Greek or Russian)` = "Other",
                                    `Islam/Muslim` = "Other",
                                    `No religion, not a believer, atheist` = "None",
                                    `Agnostic (not sure if there is a God)` = "Other",
                                    `Atheist (do not believe in God)` = "None",
                                    `Christian (VOL.)` = "Protestant",
                                    `Don't Know/Refused (VOL.)` = "Don't know/Refused",
                                    `Jewish (Judaism)` = "Jewish",
                                    `Muslim (Islam)` = "Other",
                                    `Nothing in particular` = "None",
                                    `Orthodox Church (Greek, Russian, or some other orthodox church)` = "Other",
                                    `Something else` = "Other",
                                    `Unitarian (Universalist) (VOL.)` = "Other",
                                    `Not sure/no answer/refused` = "Don't know/Refused")

dataframe$religion <- dplyr::recode(dataframe$religion,
                                    `No religion, not a believer, atheist, agnostic` = "None",
                                    `Other religion` = "Other",
                                    `Orthodox Church` = "Other")

dataframe$religion <- dplyr::recode(dataframe$religion,
                                    `(Other) Christian` = "Protestant",
                                    `(Other) Protestant` = "Protestant",
                                    `Baptist` = "Protestant",
                                    `Don't know/Refused (VOL.)` = "Don't know/Refused",
                                    `Episcopalian` = "Protestant",
                                    `Lutheran` = "Protestant",
                                    `Methodist` = "Protestant",
                                    `Mormon (Church of Jesus Christ of Latter-day Saints/LDS)` = "Other",
                                    `Orthodox (Greek, Russian, or some other orthodox church)` = "Other",
                                    `Presbyterian` = "Protestant",
                                    `Protestant (Baptist, Methodist, Non-denominational, Lutheran, Presbyterian, etc.)` = "Protestant",
                                    `Roman Catholic (Catholic)` = "Catholic",
                                    `Something else (VOL./SPECIFY)` = "Other",
                                    `Other(SPECIFY)` = "Other",
                                    `Islam` = "Other",
                                    `Orthodox` = "Other",
                                    `No/None` = "None")

## race harmonziing  - race2 
## creating - binary for non-hispanic white or not 
## (because in earlier surveys did not make a distinction between ethnicity and race)

dataframe$race2 <- dplyr::recode(dataframe$race,
                                 `White` = "Yes",
                                 `Negro` = "No",
                                 `Puerto Rican` = "No",
                                 `Other (specify)` = "No",
                                 `Oriental` = "No",
                                 `Mexican-American` = "No", 
                                 `puerto Rican` = "No",
                                 `Other specify` = "No",
                                 `WHITE` = "Yes",
                                 `NEGRO` = "No",
                                 `PUERTO RICAN` = "No",
                                 `ORIENTAL` = "No",
                                 `OTHER` = "No",
                                 `Negro, black` = "No",
                                 `Other (SPECIFY)` = "No",
                                 `Negro-Black` = "No",
                                 `American Indian` = "No",
                                 `Black` = "No",
                                 `Mexican-American` = "No",
                                 `Spnsh-mrcn (Prt Rcn, Mxcn- mrcn)` = "No",
                                 `Spnsh-mrcn (Prt Rcn, Mxcn-mrcn)` = "No",
                                 `Spanish-American (Puerto Rican, Mexican-American)` = "No",
                                 `Spnsh-mrcn (Prt Rcn, Mxcn- mrcn, tc )` = "No",
                                 `Spnsh-mrcn (Prt Rcn, Mxcn-mrcn, tc )` = "No",
                                 `Spanish-American (Puerto Rican, Mexican-American, etc.)` = "No",
                                 `Amerian Indian or Alaskan native` = "No",
                                 `Asian (Oriental) or Pacific Islander` = "No",
                                 `Black, but not Hispanic` = "No",
                                 `Spanish-American (Mexican, Cuban,Puerto Rican, Central or South American` = "No",
                                 `White, but not Hispanic` = "Yes",
                                 `American Indian or Alaskan native` = "No",
                                 `Spanish-American (Mexican, Cuban, Puerto Rican, Central or South American` = "No",
                                 `Oriental/Asian or Pacific Islander` = "No",
                                 `Oriental, Asian or Pacific Islander` = "No",
                                 `Asian or Pacific Islander` = "No",
                                 `African-American` = "No",
                                 `Some other race` = "No",
                                 `Mixed race` = "No",
                                 `African American` = "No",
                                 `Native American or Alaskan native` =  "No",
                                 `Mexican-American` = "No",
                                 `Spanish-American (Mexican, Cuban,\n Puerto Rican, Central or South American` = "No")

dataframe$race2 <- dplyr::recode(dataframe$race2,
                                 `Mexican-Amerian` = "No",
                                 `Spanish-American (Mexican, Cuban,\n                                    Puerto Rican, Central or South American` = 
                                   "No",
                                 `Hispanic White` = "No",
                                 `Non-Hispanic White` = "Yes",
                                 `Non-White` = "No",
                                 `Decline to answer` = "Not sure/refused",
                                 `Don't know/refused` = "Not sure/refused")

dataframe$race2 <- dplyr::recode(dataframe$race2,
                                 `Decline to Answer` = "Not sure/refused",
                                 `Decline to answer/not sure` = "Not sure/refused",
                                 `Decline/not sure` = "Not sure/refused",
                                 `Don't know/Refused (VOL.)` = "Not sure/refused",
                                 `Hispanic` = "No",
                                 `Mixed Race` = "No",
                                 `Mixed racial background` = "No",
                                 `Native American or Alaskan Native` = "No",
                                 `Non-white` = "No",
                                 `Not sure/refuse` = "Not sure/refused",
                                 `Other race` = "No",
                                 `Not sure` = "Not sure/refused",
                                 `Not Sure` = "Not sure/refused",
                                 `Other (Specify)` = "No",
                                 `Refused` = "Not sure/refused")


## household size - hhsize
#harmonizing  - collapsing into 1, 2, 3, 4, 5, over 6  
## 99 is probably NA, lots of weird values (56) - just classifying as over 6
 
dataframe$hhsize <- dplyr::recode(dataframe$hhsize,
                                  `Over 6 (specify)` = "Over 6",
                                  `7` = "Over 6",
                                  `8` = "Over 6",
                                  `9` = "Over 6",
                                  `10` = "Over 6",
                                  `11` = "Over 6",
                                  `12` = "Over 6",
                                  `19` = "Over 6",
                                  `13` = "Over 6",
                                  `14` = "Over 6",
                                  `17` = "Over 6",
                                  `18` = "Over 6",
                                  `20` = "Over 6",
                                  `21` = "Over 6",
                                  `23` = "Over 6",
                                  `31` = "Over 6",
                                  `32` = "Over 6",
                                  `43` = "Over 6",
                                  `56` = "Over 6",
                                  `60` = "Over 6",
                                  `15` = "Over 6",
                                  `16` = "Over 6",
                                  `35` = "Over 6",
                                  `8 or more` = "Over 6",
                                  `99` = "NA",
                                  `0` = "1")
dataframe$hhsize[dataframe$hhsize == "NA" & !is.na(dataframe$hhsize)] <- NA
dataframe$hhsize[dataframe$hhsize == "Don't know/Refused" &
                   !is.na(dataframe$hhsize)] <- NA
dataframe$hhsize <- factor(dataframe$hhsize, levels = c("1", "2", "3", "4", "5", "6", "Over 6"),
                           labels = c("1", "2", "3", "4", "5", "6", "Over 6"), ordered = TRUE)


###########################
### Age Harmonizing #######
##########################


## age harmonizing 
## age was included differently across different surveys
## typically as different open and closed ended categories
## we need respondent age to estimate the year of birth
## for each respondent

## to this we estimate a numeric age for each respondent
## for respondents whose age was included in bounded 
## categories, we draw from uniform distribution between values
## for respondents whose age was included in unbounded categories 
## (i.e. 65 plus) - we draw from a pareto distribution  
## whose parameters we estimate from the observed data
## (for that year)
## between minimum age value (e.g. 65) and 100 

## age estimation process:
## 1: impute values from uniform distribution for bounded categories
## 2 for open categories, use maximum likelihood estimation to estimate shape parameter for
## pareto distribution parameters 
## ## random draws from said distribution, bounded at upper bound

## Pew top coded 97 according to its
## survey guide: 
dataframe$age[!is.na(dataframe$age) &
                dataframe$age == "97" &
                dataframe$study %in% pew.studies] <- "97 and older"


### bounded age function
### we run this function to get a numeric
## estimate for all closed category ages
age.function.bounded <- function(age){
  newage <- age
  if(!is.na(age)){
    if(age == "21-29"){
      newage <- runif(1, min = 21, max = 29)
    } else if(age == "21 to 24" | 
              age == "21-24"){
      newage <- runif(1, min = 21, max = 24)
    } else if(age == "25 to 29" |
              age == "25-29"){
      newage <- runif(1, min = 25, max = 29)
    } else if(age == "21 to 34" |
              age == "21-34"){
      newage <- runif(1, min = 21, max = 34)
    } else if(age == "30-34" |
              age == "30 to 34"){
      newage <- runif(1, min = 30, max = 34)
    } else if(age == "35-49" |
              age == "35 to 49"){
      newage <- runif(1, min = 35, max = 49)
    } else if(age == "35 to 44"){
      newage <- runif(1, min = 35, max = 44)
    } else if (age == "45 to 49" |
               age == "45-49"){
      newage <- runif(1, min = 45, max = 49)
    } else if(age == "40-49" |
              age == "40 to 49"){
      newage <- runif(1, min = 40, max = 49)
    } else if(age == "35 to 39" |
              age == "35-39"){
      newage <- runif(1, min = 35, max = 39)
    } else if(age == "50-64" |
              age == "50 to 64"){
      newage <- runif(1, min = 50, max = 64)
    } else if(age == "55-59"){
      newage <- runif(1, min = 55, max = 59)
    } else if(age == "18-20" |
              age == "18 to 20"){
      newage <- runif(1, min = 18, max = 20)
    } else if(age == "16 to 17"){
      newage <- runif(1, min = 16, max = 17)
    } else if(age == "16 to 18"){
      newage <- runif(1, min = 16, max = 18)
    } else if(age == "19 to 20"){
      newage <- runif(1, min = 19, max = 20)
    } else if(age == "18-24"){
      newage <- runif(1, min = 18, max = 24)
    } else if(age == "60-64"){
      newage <- runif(1, min = 60, max = 64)
    } else if(age == "50-54"){
      newage <- runif(1, min = 50, max = 54)
    } else if(age == "65-69"){
      newage <- runif(1, min = 65, max = 69)
    } else if(age == "40-44" |
              age == "40 to 44"){
      newage <- runif(1, min = 40, max = 44)
    } else if(age == "65 to 74"){
      newage <- runif(1, min = 64, max = 74)
    } else if(age == "16-20"){
      newage <- runif(1, min = 16, max = 20)
    } else if(age == "21 to 29"){
      newage <- runif(1, min = 21, max = 29)
    }
  }
  return(newage)
}

set.seed(08540)
dataframe$age2 <- NA
for(i in 1:nrow(dataframe)){
  dataframe$age2[i] <- age.function.bounded(dataframe$age[i])
}

## MLE estimation of parameters of pareto distribution given
## data 
pareto.MLE <- function(X){
  n <- length(X)
  m <- min(X)
  a <- n/sum(log(X)-log(m))
  return( c(m,a) ) 
}

## implementing for data - this function
## take top coded data and does a random draw
## between the top code and 100 from the empirical pareto
## pareto function for age 
openended.fct <- function(age, estimations){
  if(!is.na(age)){
    if(age == "65 and over" |
       age == "65 or older" |
       age == "65 plus"){
      ## estimate quantiles for my pareto distribution
      ## (given shape and scale
      ## previously estimated) for the bounds
      quantiles <- VGAM::ppareto(c(65, 100), 
                                 scale = estimations[1],
                                 shape = estimations[2], 
                                 lower.tail = TRUE, 
                                 log.p = FALSE)
      ## draw a random number between 
      ## those bounds of the quantiles 
      ## from a uniform distribuiton 
      number <- runif(1, quantiles[1], quantiles[2])  
      ## use the inverse CDF to get to the value for that quantile
      ## in my estimated pareto distribution 
      age <- qpareto(number, scale = estimations[1], 
                     shape = estimations[2])
    } else if(age == "70 and over"){
      quantiles <- VGAM::ppareto(c(70, 100), 
                                 scale = estimations[1],
                                 shape = estimations[2], 
                                 lower.tail = TRUE, 
                                 log.p = FALSE)
      number <- runif(1, quantiles[1], quantiles[2])  
      age <- qpareto(number, scale = estimations[1], 
                     shape = estimations[2])
    } else if(age == "75 and over"){
      quantiles <- VGAM::ppareto(c(75, 100), 
                                 scale = estimations[1],
                                 shape = estimations[2], 
                                 lower.tail = TRUE, 
                                 log.p = FALSE)
      number <- runif(1, quantiles[1], quantiles[2])  
      age <- qpareto(number, scale = estimations[1], 
                     shape = estimations[2])
    } else if(age == "97 and older" |
              age == "97 or older"){
      quantiles <- VGAM::ppareto(c(97, 100), 
                                 scale = estimations[1],
                                 shape = estimations[2], 
                                 lower.tail = TRUE, 
                                 log.p = FALSE)
      number <- runif(1, quantiles[1], quantiles[2])  
      age <- qpareto(number, scale = estimations[1], 
                     shape = estimations[2])
    }
  }
  return(age)
}


## top code data do that I can estimate
## pareto distribution
dataframe$age.est <- dplyr::recode(dataframe$age2,
                         "65 and over" = "65",
                         "65 or older" = "65",
                         "65 plus" = "65",
                         "70 and over" = "70",
                         "75 and over" = "75",
                         "97 and older" = "97",
                         "97 or older" = "97")


## remove NAs - all have to be numeric
dataframe$age.est[dataframe$age.est == "Decline to Answer" | 
                    dataframe$age.est == "Not sure" | 
                    dataframe$age.est == "Don’t know/Refused (VOL.)" |
                    dataframe$age.est == "Refused" |
                    dataframe$age.est == "DK/Ref."] <- NA
dataframe$age.est <- as.numeric(dataframe$age.est)

years <- unique(dataframe$year)
for(i in 1:length(years)){
  age.est <- dataframe$age.est[dataframe$year == years[i] &
                                 !is.na(dataframe$age.est)]
  ### estimate parameters of pareto distribution
  ### for that year
  estimations <- pareto.MLE(age.est)
  index <- which(dataframe$year == years[i])
  for(j in 1:length(index)){
    dataframe$age2[index[j]] <- openended.fct(dataframe$age2[index[j]],
                                              estimations = estimations)
  }
}

dataframe$age2[dataframe$age2 == "Refused" |
                 dataframe$age2 == "Not sure" |
                 dataframe$age2 == "Don’t know/Refused (VOL.)" |
                 dataframe$age2 == "Decline to Answer" |
                 dataframe$age2 == "DK/Ref."] <- NA
dataframe$age2 <- as.numeric(dataframe$age2)

## year of birth
dataframe$birth <- as.numeric(dataframe$year) - dataframe$age2


############################
#### Income Harmonizing ####
############################


## income 
## plan for income: same as for age2, 
## estimate empirical distribution for each survey wave 
#using uniform draws between bounded categories and 
#top coding the open ended data, 
#using empirical distribution to estimate pareto distribution parameters 
#then for top coded data do a random draw from the estimated pareto distribution,
#bounded between top code and 3x top code

## for bottom coded data - just did random draw between 0 
## and the bottom code 

dataframe$income <- dplyr::recode(dataframe$income, 
                                  `Don't know/Refused (1999 ONLY: Both 9 & 99 DK/Ref.)` = "Don't know/Refused")

## random uniform draws for income with top and bottom bounded
income.fct.bounded <- function(income){
  if(!is.na(income)){
    if(income == "$2,000-$2,999" |
       income == "$2,000 - $2,999" |
       income == "2K-2,999"){
      income <- runif(1, min = 2000, max = 2999)
    } else if(income == "B $3,000 - $4,999" | 
              income == "B $3,000-$4,999" |
              income == "B $3,000 to $4,999" |
              income == "$3,000-$4,999" |
              income == "$3,000 to 4,999" |
              income == "$3,000 to $4,999"){
      income <- runif(1, min = 3000, max = 4999)
    } else if(income == "$3,000 - $3,999" |
              income == "3K-3,999"){
      income <- runif(1, min = 3000, max = 3999)
    } else if(income == "$4,000 - $4,999" |
              income == "4K-4,999"){
      income <- runif(1, min = 4000, max = 4999)
    } else if(income == "$5,000 - $5,999" |
              income == "5K-5,999"){
      income <- runif(1, min = 5000, max = 5999)
    } else if(income == "C $5,000 - $6,999" |
              income == "C $5,000-$6,999" |
              income == "C $5,000 to $6,999" |
              income == "$5,000-$6,999" |
              income == "$5,000 to 6,999" |
              income == "$5,000 to $6,999"){
      income <- runif(1, min = 5000, max = 6999)
    } else if(income == "$5,000-$7,499"){
      income <- runif(1, min = 5000, max = 7499)
    }else if(income == "$6,000 - $6,999" |
             income == "6K-6,999"){
      income <- runif(1, min = 6000, max = 6999)
    }else if(income == "D $7,000 - $9,999" |
             income == "D $7,000- $9,999" |
             income == "D $7,000 to $9,999" |
             income == "$7,000 - $9,999" |
             income == "$7,000-$9,999" |
             income == "$7,000 to 9,999" |
             income == "$7,000 to $9,999" |
             income == "7K-9,999"){
      income <- runif(1, min = 7000, max = 9999)
    } else if(income == "$7,500-$9,999" |
              income == "$7,500-$9,999"){
      income <- runif(1, min = 7500, max = 9999)
    } else if(income == "$7,501 to $15,000" |
              income == "7,501 to 15,000"){
      income <- runif(1, min = 7501, max = 15000)
    } else if(income == "$10,000 - $11,999" |
              income == "10K-11,999"){
      income <- runif(1, min = 10000, max = 11999)
    } else if(income == "$10,000 - $14,999" |   
              income == "E $10,000 - $14,999" |
              income == "E $10,000-$14,999" |
              income == "E $10,000 to $14,999" |
              income == "$10,000- $14,999" |
              income == "$10,000 to 14,999" |
              income == "$10,000 to $14,999" |
              income == "10,000 to $14,999" |
              income == "$10,000-$14,999" |
              income == "E $10,000 to" |
              income == "14,999" |
              income == "10K-14,999"){
      income <- runif(1, min = 10000, max = 14999)
    } else if(income == "$10,000 to under $20,000" |
              income == "10 to under $20,000" |
              income == "10K-19,999"){  
      income <- runif(1, min = 10000, max = 20000)
    } else if(income == "$12,000 - $14,999" |
              income == "12K-14,999"){
      income <- runif(1, min = 12000, max = 14999)
    } else if(income == "F $15,000 - $19,999" |
              income == "F $15,000-$19,999" |
              income == "F $15,000 to $19,999" |
              income == "$15,000-$19,999" |
              income == "$15,000 - $19,999" |
              income == "$15,000 to 19,999" |
              income == "$15,000 to $19,999" |
              income == "15,000 to $19,999" |
              income == "15K-19,999"){
      income <- runif(1, min = 15000, max = 19999)
    }else if(income == "$15,001 to $25,000" |
             income == "15,001 to 25,000" |
             income == "$15,000 to $24,999" |
             income == "$15,000 to less than $25,000"){
      income <- runif(1, min = 15001, max = 25000)
    } else if(income == "G $20,000 - $24,999" |
              income == "G $20,000-$24,999" |
              income == "G $20,000 to $24,999" |
              income == "$20,000-$24,999" |
              income == "$20,000 to 24,999" |
              income == "$20,000 to $24,999" |
              income == "20,000 to $24,999" |
              income == "$20,000 - $24,999" |
              income == "20K-24,999"){
      income <- runif(1, min = 20000, max = 24999)
    }else if(income == "$20,000 - $29,999" |
             income == "$20,000 to under $30,000" |
             income == "20 to under $30,000" |
             income == "20K-29,999"){
      income <- runif(1, min = 20000, max = 29999)
    } else if(income == "Not Sure, Refused" |
              income == "Not sure/refused" |
              income == "I Not sure" |
              income == "refused" |
              income == "No answer/refused" |
              income == "Not sure" |
              income == "Not sure/no answer/refused" |
              income == "Refused" |
              income == "Decline to answer" |
              income == "Don't know/Refused" |
              income == "(VOL. DO NOT READ) Don't know/Refused" |
              income == "Don't know/Refused (VOL.)" |
              income == "DK/Ref"){
      income <- NA 
    }else if(income == "$25,000 - $29,999" |
             income == "25K-29,999"){
      income <- runif(1, min = 25000, max = 29999)
    }else if(income == "$25,000 to $34,999" |
             income == "$25,000-$34,999" |
             income == "$25,001 to $35,000" |
             income == "25,001 to 35,000" |
             income == "$25,000 to less than $35,000"){
      income <- runif(1, min = 25000, max = 34999)
    }else if(income == "$35,000-$49,999" |
             income == "$35,000 to less than $50,000"){
      income <- runif(1, min = 35000, max = 49999)
    }else if(income == "$30,000 - $34,999" |
             income == "30K-34,999"){
      income <- runif(1, min = 30000, max = 34999)
    }else if(income == "$30,000 - $49,999" |
             income == "30K-49,999"){
      income <- runif(1, min = 30000, max = 49999)
    }else if(income == "$30,000 to under $40,000" |
             income == "30 to under $40,000" |
             income == "30K-39,999"){
      income <- runif(1, min = 30000, max = 40000)
    }else if(income == "$35,000 - $39,999" |
             income == "35K-39,999"){
      income <- runif(1, min = 35000, max = 39999)
    }else if(income == "$35,001 to $50,000" |
             income == "35,001 to 50,000" |
             income == "$35,000 to $49,999"){
      income <- runif(1, min = 35000, max = 50000)
    }else if(income == "$40,000 - $49,999" |
             income == "$40,000 to under $50,000" |
             income == "40 to under $50,000" |
             income == "40K-49,999"){
      income <- runif(1, min = 40000, max = 49999)
    }else if(income == "50,001 to 75,000" |
             income == "$50,000 - $74,999" |
             income == "$50,000 to under $75,000" |
             income == "$50,001 to $75,000" |
             income == "$50,000 to $74,999" |
             income == "50 to under $75,000" |
             income == "$50,000 to less than $75,000" |
             income == "50-74,999" |
             income == "50k-74,999"){
      income <- runif(1, min = 50001, max = 75000)
    }else if(income == "$50,000 - $99,999" |
             income == "50K-99,999"){
      income <- runif(1, min = 50000, max = 99999)
    }else if(income == "75,001 to 100,000" |
             income == "$75,001 to $100,000" |
             income == "$75,000 to $99,999" |
             income == "$75,000 - $99,999" |
             income == "$75,000 to under $100,000" |
             income == "75 to under $100,000" |
             income == "$75,000 to less than $100,000" |
             income == "75k-99,999"){
      income <- runif(1, min = 75000, max = 100000)
    }else if(income == "$100,000 to $124,999" |
             income == "$100,000 to less than $125,000"){
      income <- runif(1, min = 100000, max = 124999)
    }else if(income == "100 to under $150,000" |
             income == "100-149,999"){
      income <- runif(1, min = 100000, max = 150000 )
    }else if(income == "$125,000 to $149,999" |
             income == "$125,000 to less than $150,000"){
      income <- runif(1, min = 125000, max = 149999)
    }else if(income == "$150,000 to $199,999"){
      income <- runif(1, min = 150000, max = 199999)
    }else if(income == "$150,000 to $199,999" |
             income == "$150,000 to less than $200,000"){
      income <- runif(1, min = 150000, max = 199999)
    }else if(income == "$200,000 to $249,999" |
             income == "$200,000 to less than $250,000"){
      income <- runif(1, min = 200000, max = 249999)
    }else if(income == "Less than $2,000" |
             income == "Under $2,000" |
             income == "< 2,000"){
      income <- runif(1, min = 0, max = 2000)
    }else if(income == "A Under $3,000" |
             income == "Under $3,000" |
             income == "Under $3000"){
      income <- runif(1, min = 0, max = 3000)
    }else if(income == "Under $5,000"){
      income <- runif(1, min = 0, max = 5000)
    }else if(income == "$7,500 or less" |
             income == "Under $7500" |
             income == "7,500 or less"){
      income <- runif(1, min = 0, max = 7500)
    }else if(income == "Less than $10,000" |
             income == "< 10,000"){
      income <- runif(1, min = 0, max = 10000)
    }else if(income == "$0 to $14,999" |
             income == "$14,999 or less" |
             income == "Under $15,000" |
             income == "Less than $15,000" |
             income == "< 15,000"){
      income <- runif(1, min = 0, max = 15000)
    }else if(income == "H $25,000 and over" |
             income == "$25,000 and over" |
             income == "25,000 and over"){
      income <- 25000
    }else if(income == "$35,000 and over"){
      income <- 35000
    }else if(income == "$50,000 or more" |
             income == "$50,001 or over" |
             income == "$50,001 and over" |
             income == "$50,000 and over" |
             income == "50,000 +"){
      income <- 50000 
    }else if(income == "100,001 and over" |
             income == "$100,001 or over" |
             income == "$100,000 or more" |
             income == "100,000 +"){
      income <- 100000
    }else if(income == "$150,000 or more" |
             income == "150,000+"){
      income <- 150000
    }else if(income == "$250,000 or more"){
      income <- 250000
    }
  }
  return(income)
}

set.seed(08540)
dataframe$income.2 <- NA
for(i in 1:nrow(dataframe)){
  dataframe$income.2[i] <- income.fct.bounded(dataframe$income[i])
}


## checking to make sure all income categories converted
check.table <- table(dataframe$income.2)
names(check.table[check.table > 1])

table(dataframe$income[dataframe$income.2 == 35000])
dataframe$income.2 <- as.numeric(dataframe$income.2)

dataframe$income.2[dataframe$income.2 == 1e+05 &
                     !is.na(dataframe$income.2)] <- 100000

## open ended categories we need to draw for:
## 25000 - greater than
## 100000  - geater than
## 250000 - greater than
## 35000 - greater than
## 50000 - greater than

## use pareto.MLE function from above
## to estimate parameters 

## estimate pareto parametrs by year
## also find top value by year

years <- unique(dataframe$year)
years <- as.data.frame(years)
years$pareto1 <- NA
years$pareto2 <- NA


for(i in 1:nrow(years)){
  income.est <- dataframe$income.2[dataframe$year == years[i,1]]
  income.est <- income.est[!is.na(income.est)]
  income.est <- as.numeric(income.est)
  years[i,c(2,3)] <- pareto.MLE(income.est)
}


topcodes <- names(table(dataframe$income.2)[table(dataframe$income.2) > 2])
topcodes <- c(100000, topcodes)


## now that I have pareto distribution parameters estimated for my 
## distribution, need to do random draws 

openended.fct <- function(income, year){
  frame <- years[years$years == year, ]
  if(!is.na(income)){
    if(income %in% topcodes){
      quantiles <- VGAM::ppareto(c(income, income * 3), 
                                 scale = frame$pareto1,
                                 shape = frame$pareto2, 
                                 lower.tail = TRUE,  
                                 log.p = FALSE)
      num <- runif(1, quantiles[1], quantiles[2])  
      income <- qpareto(num, scale = frame$pareto1, 
                        shape = frame$pareto2)
    }
  }
  return(income)
}

## checking function 
openended.fct(income= 25000, year = years$years[1])
openended.fct(income= 13000, year = years$years[1])
openended.fct(income= NA, year = years$years[1])
openended.fct(income= 100000, year = years$years[1])

## what's happening here: 
## step 1- (already did above) - I found the empirical distribution for each year
## based on all the surveys from that year, with open ended categories top coded and
## data from bounded categories a random uniform draw from between the bounds 
## bottom coded data (i.e. 2000 or less) I found using a random draw between 0 and the bottom code
## step 2 - (already did above) - I found the shape and scale parameters for the pareto distribution
## from that empirical distribution, then took the max value by year (i.e. max top code for that year) and
## then multiplied that by three to find the max draw for that year
## top3x is thus defined by year, not by survey 
## step 3 openended.fct - for each value of open-ended data, 
## find the probability (quantiles) of distribution defined by scale and shape parameters for that year
## at the cutoffs I'm interested in (the value of that top coded observation and the max value for that year)
## step 4 - do a random draw from uniform distribution between those two quantiles (e.g. random 
## draw between .5 and .75)
## step 5 - go back to pareto distribution, for each open-ended observation find what value 
## that random probability draw corresponds to in the distribution (i.e. .75 in that distribution for that
## year becomes 132000)

## then use mapply
set.seed(08540)
dataframe$income.final <- mapply(openended.fct, 
                                 income = dataframe$income.2, 
                                 year = dataframe$year)


## deflate to dollars in 1966
cpi.function <- function(year, value){
  denom <- cpi[match(year, as.numeric(names(cpi)))]/32.4
  new.value <- value/denom
  return(new.value)
}
dataframe$income.final.deflated <- mapply(cpi.function, year = dataframe$year,
                                          value = dataframe$income.final)


#########################
### Harmonizing Alienation
### Index Variables ######

# dontcare dontcount leftout (harris) 
## pew.nosay pew.govtcare pew.finsat (pew)

dataframe$dontcare <- dplyr::recode(dataframe$dontcare,
                                    `Agree` = "Feel",
                                    `Agree somewhat` = "Don't Feel",
                                    `Agree strongly` = "Don't Feel",
                                    `Disagree` = "Don't Feel",
                                    `Disagree somewhat` = "Don't Feel",
                                    `Disagree strongly` = "Don't Feel",
                                    `don t feel` = "Don't Feel",
                                    `Don t feel` = "Don't Feel",
                                    `Don t Feel` = "Don't Feel",
                                    `Don't feel` = "Don't Feel",
                                    `No answer/refused` = "Refused",
                                    `No Answer/Refused` = "Refused",
                                    `not sure` = "Not Sure",
                                    `Not sure` = "Not Sure",
                                    `Do not feel` = "Don't Feel",
                                    `Do feel` = "Feel",
                                    `No, don't feel this way` = "Don't Feel",
                                    `No, don’t feel this way` = "Don't Feel",
                                    `Yes, feel this way` = "Feel",
                                    `Decline to answer` = "Refused")

dataframe$dontcount <- dplyr::recode(dataframe$dontcount,
                                     `Agree` = "Feel",
                                     `Agree somewhat` = "Don't Feel",
                                     `Agree strongly` = "Don't Feel",
                                     `Disagree` = "Don't Feel",
                                     `Disagree somewhat` = "Don't Feel",
                                     `Disagree strongly` = "Don't Feel",
                                     `don t feel` = "Don't Feel",
                                     `Don t feel` = "Don't Feel",
                                     `Don t Feel` = "Don't Feel",
                                     `Don't feel` = "Don't Feel",
                                     `No answer/refused` = "Refused",
                                     `No Answer/Refused` = "Refused",
                                     `not sure` = "Not Sure",
                                     `Not sure` = "Not Sure",
                                     `Do not feel` = "Don't Feel",
                                     `Do feel` = "Feel",
                                     `No, don't feel this way` = "Don't Feel",
                                     `No, don’t feel this way` = "Don't Feel",
                                     `Yes, feel this way` = "Feel",
                                     `Decline to answer` = "Refused")


dataframe$leftout <- dplyr::recode(dataframe$leftout,
                                   `Agree` = "Feel",
                                   `Agree somewhat` = "Don't Feel",
                                   `Agree strongly` = "Don't Feel",
                                   `Disagree` = "Don't Feel",
                                   `Disagree somewhat` = "Don't Feel",
                                   `Disagree strongly` = "Don't Feel",
                                   `don t feel` = "Don't Feel",
                                   `Don t feel` = "Don't Feel",
                                   `Don t Feel` = "Don't Feel",
                                   `Don't feel` = "Don't Feel",
                                   `No answer/refused` = "Refused",
                                   `No Answer/Refused` = "Refused",
                                   `not sure` = "Not Sure",
                                   `Not sure` = "Not Sure",
                                   `Do not feel` = "Don't Feel",
                                   `Do feel` = "Feel",
                                   `No, don't feel this way` = "Don't Feel",
                                   `No, don’t feel this way` = "Don't Feel",
                                   `Yes, feel this way` = "Feel",
                                   `Decline to answer` = "Refused")


### pew nosay
dataframe$pew.nosay <- dplyr::recode(dataframe$pew.nosay,
                                     `Completely Agree` = "Feel",
                                     `Completely Disagree` = "Don't Feel",
                                     `Don't know/Refused (VOL.)` = "Don't Know/Refused",
                                     `Don't Know/Refused (VOL.)` = "Don't Know/Refused",
                                     `Mostly Agree` = "Feel",
                                     `Mostly Disagree` = "Don't Feel",
                                     `No at all likely` = "Don't Feel",
                                     `Not too likely` = "Don't Feel",
                                     `Somewhat likely` = "Feel",
                                     `Very likely` = "Feel",
                                     `Completely agree` = "Feel",
                                     `Completely disagree` = "Don't Feel",
                                     `Mostly agree` = "Feel",
                                     `Mostly disagree` = "Don't Feel",
                                     `Don't know/Refused` = "Don't Know/Refused")

## pew finsat
dataframe$pew.finsat <- dplyr::recode(dataframe$pew.finsat,
                                      `Completely Agree` = "Feel",
                                      `Completely Disagree` = "Don't Feel",
                                      `Don't know/Refused (VOL.)` = "Don't Know/Refused",
                                      `Don't Know/Refused (VOL.)` = "Don't Know/Refused",
                                      `Mostly Agree` = "Feel",
                                      `Mostly Disagree` = "Don't Feel",
                                      `No at all likely` = "Don't Feel",
                                      `Not too likely` = "Don't Feel",
                                      `Somewhat likely` = "Feel",
                                      `Very likely` = "Feel",
                                      `Mostly agree` = "Feel",
                                      `Mostly disagree` = "Don't Feel",
                                      `Completely agree` = "Feel",
                                      `Completely disagree` = "Don't Feel",
                                      `Don't know/Refused` = "Don't Know/Refused")

## pew.govtcare

dataframe$pew.govtcare <- dplyr::recode(dataframe$pew.govtcare,
                                        `Completely Agree` = "Feel",
                                        `Completely Disagree` = "Don't Feel",
                                        `Don't know/Refused (VOL.)` = "Don't Know/Refused",
                                        `Don't Know/Refused (VOL.)` = "Don't Know/Refused",
                                        `Mostly Agree` = "Feel",
                                        `Mostly Disagree` = "Don't Feel",
                                        `No at all likely` = "Don't Feel",
                                        `Not too likely` = "Don't Feel",
                                        `Somewhat likely` = "Feel",
                                        `Very likely` = "Feel",
                                        `Mostly agree` = "Feel",
                                        `Mostly disagree` = "Don't Feel",
                                        `Completely agree` = "Feel",
                                        `Completely disagree` = "Don't Feel",
                                        `Don't know/Refused` = "Don't Know/Refused")


##### question_place
dataframe$question_place <- dplyr::recode(dataframe$question_place,
                                          `after_party` = "after party",
                                          `no_party` = "no party question")



test.2 <- dataframe[,c("study", "year", "urban", "region", "hh",
                       "inequality", "inequality_harm", "inequality.variable",
                       "union.self", "union.other", "party", "ideology",
                       "educ", "religion", "race", "race2", "hhsize",
                       "age", "age2", "birth",
                       "income.final", "income.final.deflated",
                       "dontcare", "dontcount", "leftout", "pew.nosay",
                       "pew.finsat", "pew.govtcare", "WEIGHT",
                       "state","PARTYLN",
                       "gender",
                       "question_place",
                       "RESPID",
                       "inequality.mid.poor",
                       "inequality.mid.rich",
                       "racial_gains",
                       "racial_discrimination")]

#saveRDS(test.2, file = "data_files/data_merged_4_29_2025.rds")

###########################
#### Imputation Plots #####
###########################

ggplot(dataframe, aes(x = age2)) +
  geom_histogram(binwidth=2,
                 fill="red") +
  ggtitle("Histogram of Imputed Age")  +
  theme_classic() +
  facet_wrap(~factor(year),
             scales = "free_y") +
  labs(x = "Age (Numeric Age and Random Draw from Categories)",
       y = "Count of Respondents")
##figures/age_imputation.pdf
## 5x8

ggplot(dataframe, aes(x = log(income.final.deflated))) +
  geom_density(fill = "red", alpha =.5) +
  theme_classic() +
  facet_wrap(~factor(year)) +
  labs(x = "Logged Numeric Household Income Deflated to 1966 Real Values",
       title = "Density Plots of Imputed Income",
       y = NULL)
## figures/income_imputation
